In [25]:
from bs4 import BeautifulSoup
from slugify import slugify
from urllib import request
from time import strptime, strftime
import argparse
import shelve
import pickle
import time
import os
# Resolve working paths relative to the notebook's directory.
# NOTE(review): os.path.abspath('__file__') uses the literal STRING
# '__file__' (not the __file__ variable), so in a notebook this resolves
# to the current working directory — presumably intentional here.
BASE_PATH = os.path.dirname(os.path.abspath('__file__'))
DOWNLOADS_PATH = os.path.join(BASE_PATH, 'downloads')

# Make sure the download target exists before any scraper writes to it.
if not os.path.isdir(DOWNLOADS_PATH):
    print ('CREATING DOWNLOADS_PATH ({})'.format(DOWNLOADS_PATH))
    os.mkdir(DOWNLOADS_PATH)

# Sub-folder that holds the scraped League of Legends data.
data_download_path = os.path.join(DOWNLOADS_PATH, "lol_data")

# Global chattiness switch read by the scraping helpers below.
VERBOSE = False
In [26]:
def get_page(url):
    """Download ``url`` into a temporary file and return an open handle to it.

    Note: despite the original docstring, this does NOT return a string;
    it returns an open UTF-8 text-mode file object positioned at the start
    of the downloaded page. Callers pass the handle to BeautifulSoup.

    Returns:
        An open file object on success, or ``0`` on any download error
        (callers test ``resp == 0``).
    """
    try:
        # urlretrieve stores the page in a temp file and returns its path
        # together with the response headers.
        local_filename, headers = request.urlretrieve(url)
        print( "Capturing page:", url)
    except Exception as e:
        print ("ERROR 405", e)
        return 0
    html = open(local_filename, encoding="utf-8")
    if VERBOSE:
        # BUG FIX: the original printed a never-updated placeholder
        # ``header`` (always ""); print the real response headers instead.
        print ("oppening file", local_filename, headers)
    return html
In [59]:
class Team(object):
    """Lightweight record describing one ranked team."""

    def __init__(self, team_id="", name="", country="", rank=0, rating=0, url=""):
        self.id_ = team_id        # site-assigned team identifier
        self.name = name
        self.country = country
        self.rank = rank          # ladder position
        self.rating = rating      # ladder rating points
        self.url = url            # team profile page

    def get_entry(self):
        """Return a one-line human-readable summary of the team."""
        template = "{0} {1} from {2}, is rank {3} and rating {4}"
        return template.format(self.id_, self.name, self.country, self.rank, self.rating)

    entry = property(get_entry)
class Tournament(Team):
    """One scheduled match between two teams plus its pre-match statistics."""

    def __init__(self, div, date, TeamBlue, TeamPurple, MPB, MPP, MR, TRB, TRP, FRR, RB, RP, W10B, L10B, W10P, L10P, HWB, HWP, GBB, GBP, BO, url):
        """
        MPB/MPP = Match Points Blue/Purple
        MR = Match Result
        TRB/TRP = Team Rank Blue/Purple
        FRR = First Round Result (B/P)
        RB/RP = Rating of team Blue/Purple
        W10B/W10P = Wins in the last 10 months team Blue/Purple
        L10B/L10P = Losses in the last 10 months team Blue/Purple
        HWB/HWP = Historic Wins by Blue/Purple against this opponent
        GBB/GBP = GosuBet on Blue/Purple
        BO = Best of 1, 3 or 5
        url = url of the match
        """
        self.div = div
        self.date = date
        self.TeamBlue = TeamBlue
        self.TeamPurple = TeamPurple
        self.MPB = MPB
        self.MPP = MPP
        self.MR = MR
        self.TRB = TRB
        self.TRP = TRP
        self.FRR = FRR
        self.RB = RB  # team rating
        self.RP = RP  # team rating
        self.W10B = W10B
        self.L10B = L10B
        self.W10P = W10P
        self.L10P = L10P
        self.HWB = HWB
        self.HWP = HWP
        self.GBB = GBB
        self.GBP = GBP
        self.BO = BO
        self.url = url

    def get_entry(self):
        # Quirk preserved from the original: returns a one-element set,
        # not a plain string (callers just print it).
        return {"Tournament{0}, {1} vs {2}.".format(self.div, self.TeamBlue, self.TeamPurple)}

    def get_results(self):
        # BUG FIX: the original used ``self.MR is "B"`` — identity, not
        # equality. That only works when CPython happens to intern both
        # strings; use ``==`` for a reliable comparison.
        if self.MR == "B":
            return {"Winner is {0}".format(self.TeamBlue)}
        else:
            return {"Winner is {0}".format(self.TeamPurple)}

    entry = property(get_entry)
    result = property(get_results)
class TeamData(Team):
    """Aggregated statistics for a single team scraped from its profile page."""

    def __init__(self, team_name, team_country, rank, rating, wins_last, losses_last, wins, draws, losses, players, heroes):
        self.team_name = team_name
        self.team_country = team_country
        self.rank = rank
        self.rating = rating
        self.wins_last = wins_last      # wins over the last 10 months
        self.losses_last = losses_last  # losses over the last 10 months
        self.balance = wins_last - losses_last
        self.wins = wins                # all-time wins
        self.draws = draws              # all-time draws
        self.losses = losses            # all-time losses
        total = wins + draws + losses
        if total > 0:
            self.win_percentage = (100.0 / total) * wins
            self.draw_percentage = (100.0 / total) * draws
            self.losses_percentage = (100.0 / total) * losses
        else:
            # No recorded matches: avoid a ZeroDivisionError.
            self.win_percentage = 0.0
            self.draw_percentage = 0.0
            self.losses_percentage = 0.0
        self.matches_played = total
        self.players = players          # list of player profile hrefs
        self.heroes = heroes            # de-duplicated champion names

    def get_entry(self):
        # Quirk preserved from the original: returns a one-element set.
        return {"{0}, from {1}, has {2} players. Rank: {3} and rating {4}.".format(self.team_name, self.team_country, len(self.players), self.rank, self.rating)}

    def get_results(self):
        # BUG FIX: the parameter was misspelled ``selft`` while the body
        # used ``self``, so calling this method always raised NameError.
        return {"Win% {0:.2f}, draw% {1:.2f}, losses% {2:.2f}, matches played {3}".format(self.win_percentage, self.draw_percentage, self.losses_percentage, self.matches_played)}

    def get_chance(self):
        """All-time win/draw/loss percentages plus the match count."""
        return {"win": self.win_percentage, "draw": self.draw_percentage, "losses": self.losses_percentage, "matches": self.matches_played}

    def get_chance10(self):
        """Win/loss percentages over the last 10 months.

        BUG FIX: the original divided by zero for a team with no recent
        matches; return an all-zero record in that case instead.
        """
        recent = self.wins_last + self.losses_last
        if recent == 0:
            return {"win": 0.0, "losses": 0.0, "matches": 0}
        return {"win": (100 / recent) * self.wins_last, "losses": (100 / recent) * self.losses_last, "matches": recent}

    def get_number_of_players(self):
        return len(self.players)

    number_of_players = property(get_number_of_players)
    entry = property(get_entry)
    chance = property(get_chance)
    chance10 = property(get_chance10)
In [60]:
def save_obj(obj, name):
    """Pickle ``obj`` into ``<DOWNLOADS_PATH>/<name>.pkl``.

    Uses os.path.join instead of manual "/" concatenation so the path is
    portable across platforms.
    """
    with open(os.path.join(DOWNLOADS_PATH, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """Unpickle and return ``<DOWNLOADS_PATH>/<name>.pkl``.

    Uses os.path.join instead of manual "/" concatenation.

    SECURITY NOTE: pickle.load executes arbitrary code from the file; only
    load pickles this notebook produced itself.
    """
    with open(os.path.join(DOWNLOADS_PATH, name + '.pkl'), 'rb') as f:
        return pickle.load(f)
In [61]:
def getData(url_root, url, callback=True):
    """Scrape a team-rankings page and merge the teams into data_11.pkl.

    When ``callback`` is true, pagination links found on the page are
    followed recursively (with ``callback=False`` to stop after one level).
    """
    # Strip the site root so only the page-relative part remains.
    *rest, page_url = url.split(url_root)
    if VERBOSE:
        print( "page url:", page_url)
    if not os.path.isdir(data_download_path):
        os.mkdir(data_download_path)
        if VERBOSE:
            print ('CREATING data_download_path ({})'.format(data_download_path))
    resp = get_page(url_root + url)
    soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")
    links = []
    teams_set = {}
    try:
        # Merge into any previously scraped set of teams.
        teams_set = load_obj("data_11")
    except Exception:
        # First run: no pickle yet. (Narrowed from a bare ``except:``.)
        print('nothing to load')
    if callback:
        for link in soup.table.find_all("a"):
            links.append(link.get('href'))
        links = list(set(links))
        # BUG FIX: the original called links.remove(page_url) unconditionally,
        # raising ValueError when the page does not link to itself.
        if page_url in links:
            links.remove(page_url)
        if len(links):
            for link in links:
                print (link)
    # Each <tr> is one team; the last row is a footer and is skipped.
    rank = soup.table.find_all("tr")
    for entry in rank[:-1]:
        try:
            TEAM_ID = entry["data-id"]
            TEAM_NAME = entry.find_all('span')[2].getText()
            TEAM_COUNTRY = entry.find_all('span')[1]['title']
            TEAM_RANK = int(entry.find('div').getText())
            TEAM_RATING = int(entry.find("td", class_="numbers").getText().replace(",", ""))
            print (TEAM_ID, TEAM_NAME + ",", TEAM_COUNTRY, "-", TEAM_RANK, TEAM_RATING)
            t_url = "http://www.gosugamers.net/lol/teams/" + entry["data-id"] + "-" + slugify(TEAM_NAME)
            teams_set[TEAM_ID] = Team(TEAM_ID, TEAM_NAME, TEAM_COUNTRY, TEAM_RANK, TEAM_RATING, t_url)
            print (teams_set[TEAM_ID].url)
        except Exception as e:
            # Rows with unexpected markup are reported and skipped.
            print ("ERROR 400", e)
    print ("set:", len(teams_set))
    save_obj(teams_set, "data_11")
    if len(links):
        # Follow pagination one level deep.
        for link in links:
            getData(url_root, link, False)
In [62]:
def getTeam(url_root, url, id_):
    """Scrape one team profile page and store it under ``id_`` in data_12.pkl.

    Returns -1 on success, 0 on any failure (download or parse error).
    """
    print( "team id:", id_)
    if not os.path.isdir(data_download_path):
        os.mkdir(data_download_path)
        if VERBOSE:
            print ('CREATING data_download_path ({})'.format(data_download_path))
    resp = get_page(url)
    if (resp == 0):
        print("ERROR 402", id_)
        return 0
    soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")
    team_data = {}
    try:
        team_data = load_obj("data_12")
        # BUG FIX: the original tested ``type(team_data[id_].wins) is not None``,
        # which is ALWAYS true (type() never returns None); test the value.
        if team_data[id_].wins is not None:
            if VERBOSE:
                print("data loaded")
        else:
            if VERBOSE:
                print("data empty")
    except Exception:
        # No pickle yet, or this team is not in it. (Narrowed bare except.)
        if VERBOSE:
            print("nothing to load")
    try:
        score = soup.table.find_all("span", class_='score')
        if VERBOSE:
            print ("Team Stats:", score[-3].getText(), score[-2].getText(), score[-1].getText())
        # Win/loss rate over recent months: every 12px of bar width on the
        # site's history chart represents one match.
        gameScore = soup.find("div", class_="months-wrap")
        wins = 0
        losses = 0
        rank_ = 0
        for i in gameScore.find_all("div", class_="wins"):
            wins += int(i['style'].split(":")[1].split("px;")[0]) / 12
        if VERBOSE:
            print ("wins %i" % wins)
        for i in gameScore.find_all("div", class_="losses"):
            losses += int(i['style'].split(":")[1].split("px;")[0]) / 12
        if VERBOSE:
            print ("losses %i" % losses)
        header = soup.find("div", class_="teamNameHolder")
        t_name = header.h1.getText().rstrip().lstrip().split(" - ")[0]
        t_country = header.div.span["title"]
        rating_ = int(soup.find_all("span", class_="tooltip")[0].getText().replace(",", ""))
        print (rating_)
        try:
            rank_ = int(soup.find_all("span", class_="number")[1].getText())
        except Exception:
            # Fallback when the page has a single "number" span.
            rank_ = int(soup.find_all("span", class_="number")[0].getText())
        if rank_ == 0:
            return 0
        if VERBOSE:
            # BUG FIX: the original printed the undefined names ``rank`` and
            # ``rating`` here, raising NameError whenever VERBOSE was on.
            print (t_name, t_country, rank_, rating_)
        PLAYER_ROSTER = soup.find_all('div', class_="roster")
        # Collect each roster's players and their (up to two) champion icons.
        for a in PLAYER_ROSTER:
            j = a.find_all("a", class_="player")
            player = []
            heroes = []
            for i in j:
                try:
                    heroes.append(i.find_all("img", class_="icon")[0]['alt'])
                    heroes.append(i.find_all("img", class_="icon")[1]['alt'])
                except Exception:
                    # Player has fewer than two champion icons: best effort.
                    pass
                player.append(i['href'])
            heroes = list(set(heroes))
            team_data[id_] = TeamData(
                team_name = t_name,
                team_country = t_country,
                rank = rank_,
                rating = rating_,
                wins_last = wins,
                losses_last = losses,
                wins = int(score[-3].getText()),
                draws = int(score[-2].getText()),
                losses = int(score[-1].getText()),
                heroes = heroes,
                players = player
            )
            if VERBOSE:
                print(team_data[id_].entry)
    except Exception as e:
        print ("ERROR 401", id_, e)
        return 0
    print(team_data[id_].entry)
    save_obj(team_data, "data_12")
    return -1
In [63]:
def getTeamData(url):
    """Scrape one team profile page and return its stats as a plain dict.

    Unlike getTeam(), nothing is persisted; the caller gets a dict with
    keys: wins, losses (last 10 months), rating, heroes, players,
    Total_Wins, Total_Losses. Returns 0 on any failure.
    """
    if not os.path.isdir(data_download_path):
        os.mkdir(data_download_path)
        if VERBOSE:
            print ('CREATING data_download_path ({})'.format(data_download_path))
    resp = get_page(url)
    if (resp == 0):
        print("getTeamData - ERROR 402", url)
        return 0
    soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")
    team_data = {}
    try:
        score = soup.table.find_all("span", class_='score')
        if VERBOSE:
            print ("Team Stats:", score[-3].getText(), score[-2].getText(), score[-1].getText())
        # Win/loss rate over recent months: every 12px of bar width on the
        # site's history chart represents one match.
        gameScore = soup.find("div", class_="months-wrap")
        wins = 0
        losses = 0
        rank_ = 0
        for i in gameScore.find_all("div", class_="wins"):
            wins += int(i['style'].split(":")[1].split("px;")[0]) / 12
        if VERBOSE:
            print ("wins %i" % wins)
        for i in gameScore.find_all("div", class_="losses"):
            losses += int(i['style'].split(":")[1].split("px;")[0]) / 12
        if VERBOSE:
            print ("losses %i" % losses)
        header = soup.find("div", class_="teamNameHolder")
        t_name = header.h1.getText().rstrip().lstrip().split(" - ")[0]
        t_country = header.div.span["title"]
        rating_ = int(soup.find_all("span", class_="tooltip")[0].getText().replace(",", ""))
        print (rating_)
        try:
            rank_ = int(soup.find_all("span", class_="number")[1].getText())
        except Exception:
            # Fallback when the page has a single "number" span.
            rank_ = int(soup.find_all("span", class_="number")[0].getText())
        if rank_ == 0:
            return 0
        if VERBOSE:
            print (t_name, t_country, rank_, rating_)
        PLAYER_ROSTER = soup.find_all('div', class_="roster")
        # Collect each roster's players and their (up to two) champion icons.
        for a in PLAYER_ROSTER:
            j = a.find_all("a", class_="player")
            player = []
            heroes = []
            for i in j:
                try:
                    heroes.append(i.find_all("img", class_="icon")[0]['alt'])
                    heroes.append(i.find_all("img", class_="icon")[1]['alt'])
                except Exception:
                    # Player has fewer than two champion icons: best effort.
                    pass
                player.append(i['href'])
            heroes = list(set(heroes))
            team_data = TeamData(
                team_name = t_name,
                team_country = t_country,
                rank = rank_,
                rating = rating_,
                wins_last = wins,
                losses_last = losses,
                wins = int(score[-3].getText()),
                draws = int(score[-2].getText()),
                losses = int(score[-1].getText()),
                heroes = heroes,
                players = player
            )
            if VERBOSE:
                print(team_data.entry)
    except Exception as e:
        print ("ERROR 401", e)
        return 0
    return {"wins": team_data.wins_last, "losses": team_data.losses_last, "rating": team_data.rating, "heroes": team_data.heroes, "players": team_data.players, "Total_Wins": team_data.wins, "Total_Losses": team_data.losses}
In [64]:
#team = load_obj ("data_12")
In [69]:
def Tournament_Data(url, event_name, DEV = True, year = 2016):
    """Scrape every match of an event schedule into Tournament records.

    Args:
        url: the event schedule page.
        event_name: label stored as each match's ``div``.
        DEV: when False, the accumulated matches are pickled to data_13.
        year: year prepended to the site's year-less match dates.

    Returns 0 on any failure; None on success.
    """
    tourney_data = {}
    URL_ROOT = "http://www.gosugamers.net"
    try:
        # CONSISTENCY FIX: the original loaded "tournament_data" but saved
        # "data_13" below, so previously saved matches were never reloaded.
        tourney_data = load_obj("data_13")
        if len(tourney_data):
            if VERBOSE:
                print("data loaded")
        else:
            if VERBOSE:
                print("data empty")
    except Exception:
        # Narrowed from a bare ``except:``.
        if VERBOSE:
            print("DATA NOT FOUND")
    print ("Tournament_data - looking for url:", url)
    resp = get_page(url)
    print (resp)
    if (resp == 0):
        print("ERROR 404", url)
        return 0
    soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")
    all_urls = []
    for link in soup.find("table", class_="schedule").find_all('a'):
        all_urls.append(link.get('href'))
    i = 0
    all_urls = list(set(all_urls))
    for link in all_urls:
        if "tournament" in link:
            if VERBOSE:
                print (URL_ROOT + link)
            res = get_page(URL_ROOT + link)
            if (res == 0):
                print("ERROR 402", event_name)
                return 0
            soup_match = BeautifulSoup(res, "lxml", from_encoding="UTF-8")
            div = event_name
            TeamBlue = soup_match.find("div", class_="opponent1").h3.getText()
            TeamPurple = soup_match.find("div", class_="opponent2").h3.getText()
            # e.g. "January 16, Saturday, 21:00 CEST" — the [:-5] slice drops
            # the trailing timezone before parsing.
            date = soup_match.find("p", class_="datetime").getText()
            date = str(year) + " " + date.strip()[:-5]
            date = strptime(date, "%Y %B %d, %A, %H:%M")
            MPB = soup_match.find("span", class_="hidden results btn-2").find_all("span")[0].getText()
            MPP = soup_match.find("span", class_="hidden results btn-2").find_all("span")[1].getText()
            TRB = int(soup_match.find_all("p", class_="ranked")[0].getText().replace("Ranked #", ""))
            TRP = int(soup_match.find_all("p", class_="ranked")[1].getText().replace("Ranked #", ""))
            # NOTE(review): MPB/MPP are strings, so this is a lexicographic
            # comparison. Correct for the single-digit scores of best-of-5
            # series, but would misorder double-digit scores — confirm.
            MR = "D"
            if MPB > MPP:
                MR = "B"
            elif MPP > MPB:
                MR = "P"
            FRR = MR  # first-round result is not exposed; approximated with MR
            # Head-to-head history, ad hoc: count matchup rows on each side.
            home = len(soup_match.find_all("div", class_="matchup away"))
            away = len(soup_match.find_all("div", class_="matchup home"))
            if (home + away) > 0:
                HWB = (100 / (home + away)) * home
                HWP = (100 / (home + away)) * away
            else:
                # BUG FIX: no shared history used to divide by zero; treat
                # the matchup as 50/50 instead.
                HWB = HWP = 50.0
            print("Hist B/P:", HWB, HWP)
            GBB = float(soup_match.find("div", class_="bet-opp1").span['val'])
            GBP = float(soup_match.find("div", class_="bet-opp2").span['val'])
            if (GBB < 1) and (GBP < 1):
                # No bets placed: fall back to the historic win rates.
                GBB = HWB
                GBP = HWP
            a = soup_match.find("div", class_="opponent1")
            opp_1 = a.find("a")["href"]
            a = soup_match.find("div", class_="opponent2")
            opp_2 = a.find("a")["href"]
            TEAM_BLUE = getTeamData(URL_ROOT + opp_1)
            TEAM_PURPLE = getTeamData(URL_ROOT + opp_2)
            if TEAM_BLUE == 0 or TEAM_PURPLE == 0:
                # getTeamData failed for one side; skip this match instead of
                # crashing on the subscripts below.
                print("ERROR 403, skipping match", link)
                continue
            RB = TEAM_BLUE["rating"]
            RP = TEAM_PURPLE["rating"]
            # BUG FIX: the original swapped sides here — L10B was read from
            # the PURPLE team and W10P from the BLUE team.
            W10B = TEAM_BLUE["wins"]
            L10B = TEAM_BLUE["losses"]
            W10P = TEAM_PURPLE["wins"]
            L10P = TEAM_PURPLE["losses"]
            BO = int(soup_match.find("p", class_="bestof").getText().replace("Best of ", ""))
            if VERBOSE:
                print (TRB, TeamBlue, MPB, "vs", MPP, TRP, TeamPurple, "- First round winner is", FRR, "Odds are", GBB, GBP, "best of", BO)
            tourney_data[i] = Tournament(
                div, date, TeamBlue, TeamPurple, MPB, MPP,
                MR, TRB, TRP, FRR, RB, RP, W10B, L10B,
                W10P, L10P, HWB, HWP, GBB, GBP, BO,
                "http://www.gosugamers.net" + link
            )
            i = i + 1
    if not DEV:
        save_obj(tourney_data, "data_13")
    if VERBOSE:
        print ("fin")
# Driver: enable verbose logging, then scrape the 2016 NA LCS Spring split.
# DEV=False makes Tournament_Data persist its results (to the data_13 pickle
# that the CSV-export cell below reads).
VERBOSE = True
Tournament_Data("http://www.gosugamers.net/lol/events/431-2016-na-lcs-spring", "lcs-s-16", DEV = False)
In [70]:
import csv

# Export the scraped matches (data_13 pickle) to a CSV for analysis.
# NOTE: the original also built a csv.writer(delimiter=' ', quotechar='"')
# here and immediately shadowed it with the DictWriter — that dead
# assignment is removed; output is unchanged (only the DictWriter was used).
with open('data/lol-2016/lcs-s-16.csv', 'w', newline='') as csvfile:
    a = load_obj("data_13")
    fieldnames = [
        'div', 'data', 'TeamBlue', 'TeamPurple', 'MPB', 'MPP',
        'MR', 'TRB', 'TRP', 'FRR', 'RB', 'RP', 'W10B', 'L10B',
        'W10P', 'L10P', 'HWB', 'HWP', 'GBB', 'GBP', 'BO', 'url'
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for data in a:
        # ``date`` is a struct_time; serialize as YYYY-MM-DD under the
        # (original) header name 'data'.
        writer.writerow({
            'div': a[data].div, 'data': strftime("%Y-%m-%d", a[data].date), 'TeamBlue': a[data].TeamBlue,
            'TeamPurple': a[data].TeamPurple, 'MPB': a[data].MPB, 'MPP': a[data].MPP,
            'MR': a[data].MR, 'TRB': a[data].TRB, 'TRP': a[data].TRP, 'FRR': a[data].FRR,
            'RB': a[data].RB, 'RP': a[data].RP, 'W10B': a[data].W10B, 'L10B': a[data].L10B,
            'W10P': a[data].W10P, 'L10P': a[data].L10P, 'HWB': a[data].HWB, 'HWP': a[data].HWP,
            'GBB': a[data].GBB, 'GBP': a[data].GBP, 'BO': a[data].BO, 'url': a[data].url
        })
        print ("+")
print ("fin")
In [ ]:
# Subset of the exported CSV fields intended as model features (identifiers,
# date, odds-free text fields and the match URL are left out).
feature_columns = [
    'MPB', 'MPP', 'MR', 'TRB', 'TRP',
    'RB', 'RP', 'W10B', 'L10B', 'W10P',
    'L10P', 'HWB', 'HWP', 'GBB', 'GBP',
]